import pandas as pd
import plotly.express as px
import plotly.io as pio
import numpy as np

# Fix the NumPy seed so any random sampling in this notebook is reproducible.
np.random.seed(42)

# Render Plotly figures inline in the notebook.
pio.renderers.default = "notebook"

# Geographic Analysis
## Introduction
This analysis classifies job postings by industry using the North American Industry Classification System (NAICS).
## Listing Columns So We Can Reference them in Visuals
import pandas as pd

# Load the raw Lightcast job-postings export.
# low_memory=False reads the file in a single pass before inferring dtypes,
# which resolves the DtypeWarning about mixed types in columns 19 and 30.
df = pd.read_csv("./data/lightcast_job_postings.csv", low_memory=False)
df.head()
Columns (19,30) have mixed types. Specify dtype option on import or set low_memory=False.
/tmp/ipykernel_4047/736325668.py:4: DtypeWarning:
Columns (19,30) have mixed types. Specify dtype option on import or set low_memory=False.
| ID | LAST_UPDATED_DATE | LAST_UPDATED_TIMESTAMP | DUPLICATES | POSTED | EXPIRED | DURATION | SOURCE_TYPES | SOURCES | URL | ... | NAICS_2022_2 | NAICS_2022_2_NAME | NAICS_2022_3 | NAICS_2022_3_NAME | NAICS_2022_4 | NAICS_2022_4_NAME | NAICS_2022_5 | NAICS_2022_5_NAME | NAICS_2022_6 | NAICS_2022_6_NAME | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1f57d95acf4dc67ed2819eb12f049f6a5c11782c | 9/6/2024 | 2024-09-06 20:32:57.352 Z | 0.0 | 6/2/2024 | 6/8/2024 | 6.0 | [\n "Company"\n] | [\n "brassring.com"\n] | [\n "https://sjobs.brassring.com/TGnewUI/Sear... | ... | 44.0 | Retail Trade | 441.0 | Motor Vehicle and Parts Dealers | 4413.0 | Automotive Parts, Accessories, and Tire Retailers | 44133.0 | Automotive Parts and Accessories Retailers | 441330.0 | Automotive Parts and Accessories Retailers |
| 1 | 0cb072af26757b6c4ea9464472a50a443af681ac | 8/2/2024 | 2024-08-02 17:08:58.838 Z | 0.0 | 6/2/2024 | 8/1/2024 | NaN | [\n "Job Board"\n] | [\n "maine.gov"\n] | [\n "https://joblink.maine.gov/jobs/1085740"\n] | ... | 56.0 | Administrative and Support and Waste Managemen... | 561.0 | Administrative and Support Services | 5613.0 | Employment Services | 56132.0 | Temporary Help Services | 561320.0 | Temporary Help Services |
| 2 | 85318b12b3331fa490d32ad014379df01855c557 | 9/6/2024 | 2024-09-06 20:32:57.352 Z | 1.0 | 6/2/2024 | 7/7/2024 | 35.0 | [\n "Job Board"\n] | [\n "dejobs.org"\n] | [\n "https://dejobs.org/dallas-tx/data-analys... | ... | 52.0 | Finance and Insurance | 524.0 | Insurance Carriers and Related Activities | 5242.0 | Agencies, Brokerages, and Other Insurance Rela... | 52429.0 | Other Insurance Related Activities | 524291.0 | Claims Adjusting |
| 3 | 1b5c3941e54a1889ef4f8ae55b401a550708a310 | 9/6/2024 | 2024-09-06 20:32:57.352 Z | 1.0 | 6/2/2024 | 7/20/2024 | 48.0 | [\n "Job Board"\n] | [\n "disabledperson.com",\n "dejobs.org"\n] | [\n "https://www.disabledperson.com/jobs/5948... | ... | 52.0 | Finance and Insurance | 522.0 | Credit Intermediation and Related Activities | 5221.0 | Depository Credit Intermediation | 52211.0 | Commercial Banking | 522110.0 | Commercial Banking |
| 4 | cb5ca25f02bdf25c13edfede7931508bfd9e858f | 6/19/2024 | 2024-06-19 07:00:00.000 Z | 0.0 | 6/2/2024 | 6/17/2024 | 15.0 | [\n "FreeJobBoard"\n] | [\n "craigslist.org"\n] | [\n "https://modesto.craigslist.org/sls/77475... | ... | 99.0 | Unclassified Industry | 999.0 | Unclassified Industry | 9999.0 | Unclassified Industry | 99999.0 | Unclassified Industry | 999999.0 | Unclassified Industry |
5 rows × 131 columns
# Filter for Boston, MA and Austin, TX
selected_state = ['California', 'Florida', 'Massachusetts', 'Texas', 'New York']
filtered_df = df[df['STATE_NAME'].isin(selected_state)]
# Further filter for NAICS_2022_6 = 518210 and show relevant columns
final_df = filtered_df[filtered_df['LOT_SPECIALIZED_OCCUPATION_NAME'].str.contains('analyst', case=False, na=False)]
final_df[['STATE_NAME', 'NAICS2_NAME', 'NAICS_2022_6', 'LOT_SPECIALIZED_OCCUPATION_NAME']].head(100)| STATE_NAME | NAICS2_NAME | NAICS_2022_6 | LOT_SPECIALIZED_OCCUPATION_NAME | |
|---|---|---|---|---|
| 2 | Texas | Finance and Insurance | 524291.0 | Data Analyst |
| 4 | California | Unclassified Industry | 999999.0 | Oracle Consultant / Analyst |
| 9 | New York | Professional, Scientific, and Technical Services | 541511.0 | Data Analyst |
| 10 | California | Wholesale Trade | 423830.0 | Data Analyst |
| 15 | Massachusetts | Educational Services | 611310.0 | Data Analyst |
| ... | ... | ... | ... | ... |
| 294 | Florida | Educational Services | 611310.0 | SAP Analyst / Admin |
| 295 | California | Finance and Insurance | 524114.0 | Data Analyst |
| 296 | New York | Unclassified Industry | 999999.0 | General ERP Analyst / Consultant |
| 297 | Texas | Professional, Scientific, and Technical Services | 541611.0 | SAP Analyst / Admin |
| 299 | Texas | Professional, Scientific, and Technical Services | 541511.0 | General ERP Analyst / Consultant |
100 rows × 4 columns
import matplotlib.pyplot as plt
import numpy as np
import os

# Count analyst postings per state (final_df is already filtered to
# analyst roles in the five selected states).
state_counts_jobs = final_df.groupby('STATE_NAME').size().reset_index(name='job_count')

# Order states from most to fewest postings.
state_counts_jobs_sorted = state_counts_jobs.sort_values(by='job_count', ascending=False)

# Column chart: one bar per state, colored along the coolwarm palette.
plt.figure(figsize=(8, 5))
colors = plt.cm.coolwarm(np.linspace(0, 1, len(state_counts_jobs_sorted)))
plt.bar(state_counts_jobs_sorted['STATE_NAME'], state_counts_jobs_sorted['job_count'], color=colors)
plt.xlabel('State')
plt.ylabel('Number of Jobs')
plt.title('Tech Jobs by State (Job Title Contains "Analyst")')
plt.xticks(rotation=45)
plt.tight_layout()

# Make sure the output directory exists before saving the figure.
os.makedirs('figures', exist_ok=True)
plt.savefig('figures/states_analyst_jobs.png', dpi=300, bbox_inches='tight')
plt.show()
import matplotlib.pyplot as plt
import numpy as np
import os

# Count analyst postings per city (final_df is already filtered to
# analyst roles in the five selected states).
city_counts_jobs = final_df.groupby('CITY_NAME').size().reset_index(name='job_count')

# Order cities from most to fewest postings and keep the top 10.
city_counts_jobs_sorted = city_counts_jobs.sort_values(by='job_count', ascending=False).head(10)

# Column chart: one bar per city, colored along the coolwarm palette.
plt.figure(figsize=(10, 6))
colors = plt.cm.coolwarm(np.linspace(0, 1, len(city_counts_jobs_sorted)))
plt.bar(city_counts_jobs_sorted['CITY_NAME'], city_counts_jobs_sorted['job_count'], color=colors)
plt.xlabel('City')
plt.ylabel('Number of Jobs')
plt.title('Top 10 Cities - Tech Jobs (Job Title Contains "Analyst")')
plt.xticks(rotation=45)
plt.tight_layout()

# Make sure the output directory exists before saving the figure.
os.makedirs('figures', exist_ok=True)
plt.savefig('figures/top_10_cities_analyst_jobs.png', dpi=300, bbox_inches='tight')
plt.show()
import plotly.express as px
import plotly.io as pio
# Create nationwide data - remove state filtering to include all states
try:
# Use the original df (before state filtering) to get all states
all_states_df = df[df['LOT_SPECIALIZED_OCCUPATION_NAME'].str.contains('analyst', case=False, na=False)]
except NameError:
# If df not available, load it fresh
import pandas as pd
df = pd.read_csv("./data/lightcast_job_postings.csv")
all_states_df = df[df['LOT_SPECIALIZED_OCCUPATION_NAME'].str.contains('analyst', case=False, na=False)]
# Group by all states and count jobs
all_state_counts = all_states_df.groupby('STATE_NAME').size().reset_index(name='job_count')
all_state_counts_sorted = all_state_counts.sort_values(by='job_count', ascending=False)
print("Top 10 states with most analyst jobs:")
print(all_state_counts_sorted.head(10))
print(f"\nTotal states with analyst jobs: {len(all_state_counts_sorted)}")
# Comprehensive state abbreviation mapping
state_abbrev_map = {
'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO',
'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
'District of Columbia': 'DC'
}
# Add state abbreviations to the data
all_state_counts_sorted['state_abbrev'] = all_state_counts_sorted['STATE_NAME'].map(state_abbrev_map)
# Filter out any states that couldn't be mapped (in case of data issues)
mapped_states = all_state_counts_sorted.dropna(subset=['state_abbrev'])
print(f"\nStates successfully mapped: {len(mapped_states)}")
if len(mapped_states) < len(all_state_counts_sorted):
unmapped = all_state_counts_sorted[all_state_counts_sorted['state_abbrev'].isna()]
print("Unmapped states:")
print(unmapped['STATE_NAME'].tolist())
# Create a choropleth map showing job counts for all states
fig = px.choropleth(
mapped_states,
locations='state_abbrev',
color='job_count',
locationmode='USA-states',
color_continuous_scale='Greens',
labels={'job_count': 'Number of Jobs', 'STATE_NAME': 'State'},
hover_name='STATE_NAME',
hover_data={'state_abbrev': False, 'job_count': True}
)
# Update layout with no title to maximize map space
fig.update_layout(
geo_scope='usa',
width=1000,
height=700,
margin=dict(t=5, b=5, l=5, r=5), # Minimal margins on all sides
geo=dict(
projection_type='albers usa',
showlakes=True,
lakecolor='rgb(255, 255, 255)',
bgcolor='rgba(0,0,0,0)'
)
)
# Show the interactive map
fig.show()Top 10 states with most analyst jobs:
STATE_NAME job_count
42 Texas 7084
4 California 6501
8 Florida 3206
31 New York 3056
12 Illinois 3045
45 Virginia 2989
32 North Carolina 2423
34 Ohio 2403
9 Georgia 2363
29 New Jersey 2289
Total states with analyst jobs: 51
States successfully mapped: 50
Unmapped states:
['Washington, D.C. (District of Columbia)']
Top 10 states with most analyst jobs:
STATE_NAME job_count
42 Texas 7084
4 California 6501
8 Florida 3206
31 New York 3056
12 Illinois 3045
45 Virginia 2989
32 North Carolina 2423
34 Ohio 2403
9 Georgia 2363
29 New Jersey 2289
Total states with analyst jobs: 51
States successfully mapped: 50
Unmapped states:
['Washington, D.C. (District of Columbia)']
Tech Jobs Nationwide (Job Title Contains “Analyst”)
Interactive map showing the distribution of analyst jobs across all US states.
# filter df for NAICS_2022_2 is 44 and is 51
# df_41 = df[df['NAICS_2022_2'].isin(['44'])]
#
# df_41.head()selected_naics = [11, 21, 22]
filtered_df = df[df['NAICS_2022_2'].isin(selected_naics)]
# Further filter for NAICS_2022_6 = 518210 and show relevant columns
# final_df = filtered_df[filtered_df['NAICS_2022_2'].str.contains('analyst', case=False, na=False)]
filtered_df[['STATE_NAME', 'NAICS_2022_2', 'NAICS_2022_2_NAME', 'LOT_SPECIALIZED_OCCUPATION_NAME']].head(100)| STATE_NAME | NAICS_2022_2 | NAICS_2022_2_NAME | LOT_SPECIALIZED_OCCUPATION_NAME | |
|---|---|---|---|---|
| 376 | North Carolina | 21.0 | Mining, Quarrying, and Oil and Gas Extraction | Data Analyst |
| 394 | North Carolina | 22.0 | Utilities | General ERP Analyst / Consultant |
| 502 | California | 22.0 | Utilities | Business Analyst (General) |
| 525 | Nebraska | 22.0 | Utilities | Business Analyst (General) |
| 632 | Massachusetts | 22.0 | Utilities | Business Analyst (General) |
| ... | ... | ... | ... | ... |
| 8531 | Michigan | 21.0 | Mining, Quarrying, and Oil and Gas Extraction | Data Analyst |
| 8553 | Texas | 22.0 | Utilities | Enterprise Architect |
| 8682 | Florida | 11.0 | Agriculture, Forestry, Fishing and Hunting | General ERP Analyst / Consultant |
| 8698 | Colorado | 22.0 | Utilities | Enterprise Architect |
| 8702 | Oregon | 22.0 | Utilities | Enterprise Architect |
100 rows × 4 columns
import plotly.express as px

# Restrict to analyst roles, then to postings that report a salary.
analyst_df = df[df['LOT_SPECIALIZED_OCCUPATION_NAME'].str.contains('analyst', case=False, na=False)]
analyst_salary_df = analyst_df.dropna(subset=['SALARY'])

print(f"Total analyst jobs: {len(analyst_df)}")
print(f"Analyst jobs with salary data: {len(analyst_salary_df)}")

# Per-state aggregates: mean salary (bubble size/color) and posting count (y-axis).
per_state = analyst_salary_df.groupby('STATE_NAME')['SALARY'].agg(['mean', 'count'])
bubble_data = per_state.rename(columns={'mean': 'SALARY', 'count': 'job_count'}).reset_index()

print(f"\nStates with analyst salary data: {len(bubble_data)}")
print("\nTop 10 states by job count:")
print(bubble_data.sort_values('job_count', ascending=False).head(10))

# Keep only the ten states with the most salary-reporting postings.
top_10_states = bubble_data.sort_values('job_count', ascending=False).head(10)

# Bubble chart: x = state, y = posting count, bubble size & color = mean salary.
hover_cfg = {
    'SALARY': ':,.0f',
    'job_count': True,
    'STATE_NAME': False
}
label_cfg = {
    'SALARY': 'Average Salary ($)',
    'STATE_NAME': 'State',
    'job_count': 'Number of Jobs (Excludes Null Salaries)'
}
fig = px.scatter(
    top_10_states,
    x='STATE_NAME',
    y='job_count',
    size='SALARY',
    color='SALARY',
    hover_name='STATE_NAME',
    hover_data=hover_cfg,
    labels=label_cfg,
    title='Top 10 States for Analyst Jobs (Bubble Size = Salary)',
    color_continuous_scale='Greens'
)

# Overall canvas size and slanted state labels.
fig.update_layout(
    width=1200,
    height=700,
    xaxis_tickangle=-45,
    showlegend=True
)

# Scale bubbles by diameter with a white outline and a minimum visible size.
marker_cfg = dict(
    sizemode='diameter',
    sizeref=2.0 * top_10_states['SALARY'].max() / (15. ** 2),
    sizemin=4,
    line=dict(width=1, color='white')
)
fig.update_traces(marker=marker_cfg)

fig.show()